This dataset is a selection of data from a publicly available
repository of the National Cancer Institute, i.e. the NCI-60 dataset,
which includes gene expression analysis as well as data from
metabolomics and proteomics experiments. It provides experimental data
obtained from 60 human cancer cell lines derived from nine tissue
origins, such as breast, colon, lung, ovary, blood and skin. These cell
lines constitute key in vitro models for cancer research and they are
used for extensive anti-cancer drug screening.
R.H. Shoemaker, The NCI60 human tumour cell line anticancer drug screen,
Nat. Rev. Cancer 6 (2006) 813–823.
Some variables with near zero variance are removed from the metabolomics dataset.
load("../data/NCI60_custom.RData")
genes <- NCI60_custom$transcripto$data
metabo <- NCI60_custom$metabo$data
proteo <- NCI60_custom$proteo$data
# remove near zero variance features in metabolomics data
metaboNZV <- caret::nearZeroVar(metabo, saveMetrics = TRUE)
metabo <- metabo[, metaboNZV[,"zeroVar"]==FALSE]
data <- list(genes = genes,
metabo = metabo,
proteo = proteo)
metadata <- data.frame(cell.line = NCI60_custom$metabo$splMD$Cell.line, origin = NCI60_custom$metabo$splMD$Origin)
rownames(metadata) <- NCI60_custom$metabo$splMD$splID
Principal Component Analysis is run on each centered and scaled dataset.
pca.res <- lapply(data, function(x) {
prcomp(x, center=TRUE, scale.=TRUE)
})
varPercent <- lapply(pca.res, function(x) {x$sdev^2/sum(x$sdev^2) * 100})
for(omic in c("genes", "metabo", "proteo")){
barplot(varPercent[[omic]], xlab='PC', ylab='Percent Variance', names.arg=1:length(varPercent[[omic]]), main=omic)
}
for(omic in c("genes", "metabo", "proteo")){
scores <- data.frame(metadata, pca.res[[omic]]$x)
p <- ggplot(scores, aes(x=PC1, y=PC2, col=origin)) +
geom_point(size=4) +
labs(x=paste0("PC1 - ", round(varPercent[[omic]][1], digits=2), "%"),
y=paste0("PC2 - ", round(varPercent[[omic]][2], digits=2), "%"),
title = paste0(omic, " - scores plots PC1 vs PC2")) +
theme_light()
print(p)
}
for(omic in c("genes", "metabo", "proteo")){
loadings <- data.frame(pca.res[[omic]]$rotation)
loadings$variables <- rownames(loadings)
p <- ggplot(loadings, aes(x=PC1, y=PC2, label=variables)) +
geom_point(size=1) +
geom_text_repel(size=1) +
labs(x=paste0("PC1 - ", round(varPercent[[omic]][1], digits=2), "%"),
y=paste0("PC2 - ", round(varPercent[[omic]][2], digits=2), "%"),
title = paste0(omic, " - loadings plots PC1 vs PC2")) +
theme_light()
print(p)
}
A Partial Least Square Discriminant Analysis is done on colon and ovarian tissues based on each omic data.
data.pls <- list(genes = genes[metadata$origin %in% c("Colon", "Ovarian"),],
metabo = metabo[metadata$origin %in% c("Colon", "Ovarian"),],
proteo = proteo[metadata$origin %in% c("Colon", "Ovarian"),])
metadata.pls <- metadata[metadata$origin %in% c("Colon", "Ovarian"),]
Y <- data.frame("Colon" = ifelse(metadata.pls$origin == "Colon", 1, 0),
"Ovarian" = ifelse(metadata.pls$origin == "Ovarian", 1, 0))
pls.res <- lapply(data.pls, function(x) {
pls(X=x, Y=Y, ncomp=2, scale=TRUE, mode="regression")
})
for(omic in c("genes", "metabo", "proteo")){
scores <- data.frame(metadata.pls, pls.res[[omic]]$variates$X)
p <- ggplot(scores, aes(x=comp1, y=comp2, col=origin)) +
geom_point(size=4) +
labs(x="latent variable 1",
y="latent variable 2",
title = paste0(omic, " - scores plots LV1 vs LV2")) +
theme_light()
print(p)
}
for(omic in c("genes", "metabo", "proteo")){
loadings <- data.frame(pls.res[[omic]]$loadings$X)
loadings$variables <- rownames(loadings)
p <- ggplot(loadings, aes(x=comp1, y=comp2, label=variables)) +
geom_point(size=1) +
geom_text_repel(size=1) +
labs(x=paste0("LV1"),
y=paste0("LV2"),
title = paste0(omic, " - loadings plots LV1 vs LV2")) +
theme_light()
print(p)
}
# prepare dataset
ComDim_data <- cbind.data.frame(scale(genes), scale(metabo), scale(proteo))
n_group <- c(dim(genes)[[2]], dim(metabo)[[2]], dim(proteo)[[2]])
# run analysis
ComDim_res <- ComDim(X = ComDim_data, group = n_group, plotgraph = F)
All three omics data contribute to the first dimension, the second dimension is driven by the metabolomics data and the third one by genes and proteomics data.
# saliences
saliences <- ComDim_res$saliences
rownames(saliences) <- c("genes", "metabo", "proteo")
saliences <- as.data.frame(t(saliences[,1:4]))
saliences$Dim <- rownames(saliences)
saliences <- melt(saliences)
ggplot(saliences, aes(x=Dim, y=value, fill=variable)) +
geom_bar(stat = "identity", position=position_dodge()) +
theme_light() +
labs(x = "global components", y = "specific weights of block on global components", fill = "omic",
title = "saliences")
# block contributions
contributions <- ComDim_res$contrib
rownames(contributions) <- c("genes", "metabo", "proteo")
contributions <- as.data.frame(t(contributions[,1:4]))
contributions$Dim <- rownames(contributions)
contributions <- melt(contributions)
ggplot(contributions, aes(x=Dim, y=value, fill=variable)) +
geom_bar(stat = "identity", position=position_dodge()) +
theme_light() +
labs(x = "global components", y = "percentage of block contribution to global components", fill = "omic",
title = "blocks contributions")
Scores plots on the first 4 dimensions show that samples from the different origins cluster naturally based on genes, metabolomics and proteomics data.
scores <- data.frame(metadata, ComDim_res$T)
ggplot(scores, aes(x=Dim.1, y=Dim.2, col=origin)) +
geom_point(size=4) +
labs(x=paste0("Dim.1 - ", ComDim_res$cumexplained[1,"%explX"], "%"),
y=paste0("Dim.2 - ", ComDim_res$cumexplained[2,"%explX"], "%"),
title = "scores plots on Dim.1 Dim.2") +
theme_light()
ggplot(scores, aes(x=Dim.3, y=Dim.4, col=origin)) +
geom_point(size=4) +
labs(x=paste0("Dim.3 - ", ComDim_res$cumexplained[3,"%explX"], "%"),
y=paste0("Dim.4 - ", ComDim_res$cumexplained[4,"%explX"], "%"),
title = "scores plots on Dim.3 Dim.4") +
theme_light()
Here are the loadings plots on the 4 dimensions.
loadings <- data.frame(ComDim_res$globalcor)
loadings$omic <- c(rep("genes", dim(genes)[[2]]), rep("metabo", dim(metabo)[[2]]), rep("proteo", dim(proteo)[[2]]))
loadings$variable <- rownames(loadings)
ggplot(loadings, aes(x=X1, y=X2, col=omic, label=variable)) +
geom_point(siue = 2) +
geom_text_repel(size = 1, max.overlaps = 50, segment.size=0.1) +
labs(x=paste0("Dim.1 - ", ComDim_res$cumexplained[1,"%explX"], "%"),
y=paste0("Dim.2 - ", ComDim_res$cumexplained[2,"%explX"], "%"),
title = "loadings plots on Dim.1 Dim.2") +
theme_light()
ggplot(loadings, aes(x=X3, y=X4, col=omic, label=variable)) +
geom_point(siue = 2) +
geom_text_repel(size = 1, max.overlaps = 50, segment.size=0.1) +
labs(x=paste0("Dim.3 - ", ComDim_res$cumexplained[3,"%explX"], "%"),
y=paste0("Dim.4 - ", ComDim_res$cumexplained[4,"%explX"], "%"),
title = "loadings plots on Dim.3 Dim.4") +
theme_light()
Run block.plsda analysis with block.plsda() from mixomics package
# prepare data
blockPLS_data <- lapply(data.pls, scale)
origin <- as.factor(metadata$origin[metadata$origin %in% c("Colon", "Ovarian")])
# run analysis
blockPLS_res <- block.plsda(X = blockPLS_data, Y = origin, design = "full", all.outputs = T, ncomp = 10, near.zero.var = T)
Run perf() plot the results with plot() Run the analysis with optimal number of latent variables In this analysis one latent variable is sufficient to discriminate between these two tissue origins. We still compute a model with two latent variables to be able to plot scores and loadings on 2D graphs.
blockPLS_perf <- perf(blockPLS_res, validation = 'Mfold', folds = 7, nrepeat = 1, auc = TRUE, cpus=2, progressBar = FALSE)
plot(blockPLS_perf)
blockPLS_res <- block.plsda(X = blockPLS_data, Y = origin, design = "full", all.outputs = T, ncomp = 2, near.zero.var = T)
A permutation test run with DIABLO.test() from RVAideMemoire package shows that the true model is statistically different from randome ones.
blockPLS_permtest <- DIABLO.test(blockPLS_res, progress = FALSE)
blockPLS_permtest
##
## Permutation test based on cross-validation
##
## data: blockPLS_res
## DIABLO (2 components)
## 999 permutations
## CER = 0.042857, p-value = 0.001
Almost the same proportion of each omics data is used to build the first latent variable. Metabolites contributes slightly more.
blockPLS_expl <- do.call("rbind",blockPLS_res$AVE$AVE_X[1:3])
blockPLS_expl <- rbind(blockPLS_expl, blockPLS_res$AVE[["AVE_outer"]])
rownames(blockPLS_expl)[4] <- "outer_model"
blockPLS_expl <- melt(blockPLS_expl)
colnames(blockPLS_expl) <- c("block", "comp", "value")
ggplot(blockPLS_expl, aes(x=comp, y=value, fill=block)) +
geom_bar(stat="identity", position=position_dodge()) +
labs(x="component",
y="% of explained variance") +
theme_light()
As aimed, the first latent variable discriminates between the two tissues origins, the second latent variable shows orthogonal intra group variation.
blockPLS_variates.weighted <- blockPLS_res$variates[c("genes", "metabo", "proteo")]
for(omic in c("genes", "metabo", "proteo")){
for(comp in c("comp1", "comp2")){
blockPLS_variates.weighted[[omic]][,comp] <- blockPLS_variates.weighted[[omic]][,comp] * blockPLS_res$weights[omic, comp]
}
}
blockPLS_scores.weighted <- abind(blockPLS_variates.weighted[c("genes", "metabo", "proteo")], along = 3)
blockPLS_scores.weighted <- apply(blockPLS_scores.weighted, c(1,2), mean)
blockPLS_scores.weighted <- data.frame(metadata.pls, blockPLS_scores.weighted)
ggplot(blockPLS_scores.weighted, aes(x=comp1, y=comp2, col=origin)) +
geom_point(size=4) +
theme_light()
Loadings are plotted on the 2 first latent variables, a barplot visualisation for each dataset could be used to be able to find the most discriminant variables in each dataset.
blockPLS_loadings_genes <- blockPLS_res$loadings$genes
blockPLS_loadings_metabo <- blockPLS_res$loadings$metabo
blockPLS_loadings_proteo <- blockPLS_res$loadings$proteo
blockPLS_loadings <- rbind.data.frame(blockPLS_loadings_genes, blockPLS_loadings_metabo, blockPLS_loadings_proteo)
blockPLS_loadings$omic <- c(rep("genes", dim(blockPLS_loadings_genes)[[1]]), rep("metabo", dim(blockPLS_loadings_metabo)[[1]]), rep("proteo", dim(blockPLS_loadings_proteo)[[1]]))
blockPLS_loadings$variable <- rownames(blockPLS_loadings)
ggplot(blockPLS_loadings, aes(x=comp1, y=comp2, col=omic, label=variable)) +
geom_point(size=2) +
geom_text_repel(size=2, max.overlaps = 25, segment.size=0.2) +
theme_light()